First, we must load our data. This path points to my local machine, so disregard it; in the GitHub repository it would be "./data/census_2011_UK_OA.RData".

# Load the pre-packaged 2011 census counts for UK Output Areas.
# NOTE(review): absolute local path — relative to the repository this is
# "./data/census_2011_UK_OA.RData".
load("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/census_2011_UK_OA.RData")

Next, we must subset our data to pull census results for only Liverpool.

# Keep only the Liverpool Output Areas: left-join the full census counts
# onto the Liverpool OA list, keyed on the OA code.
Census_2011_Count <- merge(Liverpool, Census_2011_Count_All, by = "OA", all.x = TRUE)

# Inspect the variable lookup table (OAC variable codes, their denominators,
# and the England_Wales census columns that must be aggregated).
# The redundant `[,]` subset has been dropped.
head(OAC_Input_Lookup)
##   VariableCode  Type Denominator      SubDomain      Domain VariableDescription
## 1         k001 Count KS102EW0001 Population Age Demographic          Age 0 to 4
## 2         k002 Count KS102EW0001 Population Age Demographic         Age 5 to 14
## 3         k003 Count KS102EW0001 Population Age Demographic        Age 25 to 44
## 4         k004 Count KS102EW0001 Population Age Demographic        Age 45 to 64
## 5         k005 Count KS102EW0001 Population Age Demographic        Age 65 to 89
## 6         k006 Count KS102EW0001 Population Age Demographic     Age 90 and over
##                         England_Wales
## 1                         KS102EW0002
## 2 KS102EW0003,KS102EW0004,KS102EW0005
## 3             KS102EW0010,KS102EW0011
## 4             KS102EW0012,KS102EW0013
## 5 KS102EW0014,KS102EW0015,KS102EW0016
## 6                         KS102EW0017

The next phase of the analysis is to aggregate the raw census variables into the OAC input variables. Looking at the data frame OAC_Input_Lookup above, we can see that each OAC variable code is built from one or more census source codes (listed, comma-separated, under the England_Wales column). We must write code to aggregate these.

# First we pull a column of all OA codes to anchor the aggregated variables.
OAC_Input <- as.data.frame(Census_2011_Count$OA)
colnames(OAC_Input) <- "OA"

# For each row of the lookup table, sum the England_Wales source columns
# into one aggregated variable named after the OAC variable code.
# Building all columns with lapply() and binding them once avoids growing
# the data frame with cbind() inside a loop, and seq_len() is safe if the
# lookup were ever empty (unlike 1:nrow()).
agg_cols <- lapply(seq_len(nrow(OAC_Input_Lookup)), function(n) {

      # England_Wales holds a comma-separated list of source column codes;
      # split it into a character vector for column selection.
      select_vars <- unlist(strsplit(paste(OAC_Input_Lookup[n, "England_Wales"]), ","))

      # Row-wise sum of the source columns; drop = FALSE keeps a data frame
      # even when only a single source column is selected.
      rowSums(Census_2011_Count[, select_vars, drop = FALSE])
})

# Name each aggregated column with its OAC variable code, then bind all of
# them onto the OA column in one step.
names(agg_cols) <- paste(OAC_Input_Lookup$VariableCode)
OAC_Input <- cbind(OAC_Input, do.call(cbind, agg_cols))

Now that we have aggregate variables in OAC_Input, we must first remove the k035 variable from OAC_Input; it will be recalculated later:

# Drop k035 for now — per the text above it is recalculated later as a
# standardised illness ratio rather than aggregated as a count.
OAC_Input$k035 <- NULL

Next, we must create our denominators from the same census file.

# Build the list of unique denominator codes used by the lookup table,
# dropping empty entries (variables that have no denominator).
den_list <- unique(OAC_Input_Lookup[,"Denominator"])
den_list <- paste(den_list[den_list != ""])

# Select the OA code plus every denominator column from the census counts.
# (The original also pre-built OAC_Input_den from Census_2011_Count$OA, but
# that value was immediately overwritten by this line, so the dead step has
# been removed.)
OAC_Input_den <- Census_2011_Count[,c("OA",den_list)]

We then merge these denominators with the OAC_Input data frame containing our numerators, using our OA value as a key.

# Join the denominator columns onto the aggregated numerators, keyed on OA.
OAC_Input <- merge(OAC_Input,OAC_Input_den, by="OA")

Now that we have both numerators and denominators, we can compute the percentage each numerator represents of its denominator.

To do this, we first build a lookup of numerator/denominator pairs called K_Var:

# Keep only the "Count" variables from the lookup, retaining columns 1 and 3
# (VariableCode and Denominator) — the numerator/denominator pairs used below.
K_Var <- OAC_Input_Lookup[OAC_Input_Lookup$Type == "Count",c(1,3)]

Then we perform the looping to match our percentages

# We set up another data frame with OA values to append everything to.
OAC_Input_PCT_RATIO <- subset(OAC_Input, select = "OA")

# For every numerator/denominator pair in K_Var, compute the percentage
# (numerator / denominator * 100) for each OA. All columns are built with
# lapply() and bound once, instead of growing the frame with cbind() inside
# the loop; seq_len() replaces the unsafe 1:nrow() pattern.
pct_cols <- lapply(seq_len(nrow(K_Var)), function(n) {
  num <- paste(K_Var[n, "VariableCode"])  # numerator column name
  den <- paste(K_Var[n, "Denominator"])   # denominator column name
  OAC_Input[, num] / OAC_Input[, den] * 100
})

# Name each percentage column after its numerator code and bind in one step.
names(pct_cols) <- paste(K_Var$VariableCode)
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO, do.call(cbind, pct_cols))

This will give us a list of percentages for every variable within Liverpool. We can use this to perform our spatial analysis.

First, however, we need to add population density from the original census data:

# Population density is held in the original census table as KS101EW0008;
# extract it alongside the OA code and rename it to its OAC code, k007.
tmp <- setNames(Census_2011_Count[, c("OA", "KS101EW0008")], c("OA", "k007"))

# Join the density column onto the percentages data frame, keyed on OA.
OAC_Input_PCT_RATIO <- merge(OAC_Input_PCT_RATIO, tmp, by = "OA")

Finally, we must add in the SIR (standardised illness ratio) variable that we removed earlier.

# Reconstruct k035 as a standardised illness ratio (SIR): observed ill
# people divided by the number expected given the age structure of each OA,
# computed over two broad age bands (16-64 vs under-16/65-plus).
# NOTE(review): the KS301/KS102/KS101 column codes below are taken on trust
# from the original lab — confirm against the census table metadata.

# Calculate rates of ill people 15 or less and greater than or equal to 65
ill_16_64 <- rowSums(Census_2011_Count[,c("KS301EW0005","KS301EW0006")]) # Ill people 16-64
ill_total <-   rowSums(Census_2011_Count[,c("KS301EW0002","KS301EW0003")]) # All ill people
ill_L15_G65 <- ill_total - ill_16_64 # Ill people 15 or less and greater than or equal to 65

# Calculate total people 15 or less and greater than or equal to 65
t_pop_16_64 <- rowSums(Census_2011_Count[,c("KS102EW0007","KS102EW0008","KS102EW0009","KS102EW0010","KS102EW0011","KS102EW0012","KS102EW0013")]) # People 16-64
t_pop <- Census_2011_Count$KS101EW0001 # All people
t_pop_L15_G65 <- t_pop - t_pop_16_64 # All people 15 or less and greater than or equal to 65

# Calculate expected rate: each OA's population in a band multiplied by the
# Liverpool-wide illness rate for that band.
ex_ill_16_64 <- t_pop_16_64 * (sum(ill_16_64)/sum(t_pop_16_64)) # Expected ill 16-64
ex_ill_L15_G65 <- t_pop_L15_G65 * (sum(ill_L15_G65)/sum(t_pop_L15_G65)) # Expected ill people 15 or less and greater than or equal to 65

ex_ill <- ex_ill_16_64 + ex_ill_L15_G65 # total expected ill people

# Ratio: observed / expected * 100, so 100 = the Liverpool average.
SIR <- as.data.frame(ill_total / ex_ill * 100) # ratio between ill people and expected ill people
colnames(SIR) <- "k035"

# Merge data — cbind is safe here only if row order matches OAC_Input_PCT_RATIO;
# NOTE(review): a merge on OA would be more defensive.
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO,SIR)

# Remove unwanted objects
remove(list=c("SIR","ill_16_64","ill_total","ill_L15_G65","t_pop_16_64","t_pop","t_pop_L15_G65","ex_ill_16_64","ex_ill_L15_G65","ex_ill"))

Next, we must normalize and standardize all our proportions added into OAC_Input_PCT_RATIO. We use the inverse hyperbolic sine and then a range standardization.

# Inverse hyperbolic sine transform of every rate column (columns 2:61;
# column 1 is the OA code). base::asinh(x) is mathematically identical to
# the hand-rolled log(x + sqrt(x^2 + 1)) but clearer and better tested.
OAC_Input_PCT_RATIO_IHS <- asinh(OAC_Input_PCT_RATIO[,2:61])

# Range Standardization
range_01 <- function(x) { # linearly map a numeric vector onto [0, 1]
  r <- range(x)
  (x - r[1]) / (r[2] - r[1])
}
# Rescale every transformed column onto [0, 1]. Note apply() returns a
# numeric matrix here (not a data frame), which is what kmeans() expects.
OAC_Input_PCT_RATIO_IHS_01 <- apply(OAC_Input_PCT_RATIO_IHS, 2, range_01) # apply range function to columns

# Add the OA codes back as row names
rownames(OAC_Input_PCT_RATIO_IHS_01) <- OAC_Input_PCT_RATIO$OA

In total, this gives us the normalized and standardized variables for each of the social indicator variables included in the original 2011 OAC study for Liverpool. The next step of this analysis requires us to find the ideal number of clusters for our aggregation level.

We can do this by using a total within sum of squares calculation. This is essentially a “mean standardized distance of the data observations to a cluster mean”

library(ggplot2)

# Elbow plot: run k-means for k = 2..12 and record the total within-cluster
# sum of squares (Twss) for each k. vapply() over the candidate values
# replaces the original pattern of growing `wss` element-by-element and
# then discarding the unused wss[1] slot with wss[-1].
# NOTE(review): kmeans() starts are random — call set.seed() beforehand if
# reproducible Twss values are needed.
k_candidates <- 2:12
twss <- vapply(k_candidates, function(k) {
  sum(kmeans(OAC_Input_PCT_RATIO_IHS_01, centers = k, nstart = 20)$withinss)
}, numeric(1))

# Assemble the results: cluster count k vs total within sum of squares.
wss <- data.frame(k = k_candidates, Twss = twss)

# Plot
ggplot(data = wss, aes(x = k, y = Twss)) + geom_path() + geom_point() +
  scale_x_continuous(breaks = 2:12) + labs(y = "Total within sum of squares")

This plot shows how the total within sum of squares decreases as more clusters are used. The smallest drop appears between 7 and 8, and 7 is, according to the original lab document, the number selected for the initial OAC analysis in 2011.

Now that we have a value of 7, we can run our K means using our 7 cluster value.

# Final k-means with k = 7. iter.max and nstart are set very high, so this
# run is slow; a pre-computed result is loaded from cluster_7.Rdata below.
cluster_7 <- kmeans(x=OAC_Input_PCT_RATIO_IHS_01, centers=7, iter.max=1000000, nstart=10000)

This is also included in the package for this lab, loaded from “./data/cluster_7.Rdata”:

# Load the pre-computed 7-cluster k-means result (object `cluster_7`).
# NOTE(review): absolute local path — relative to the repository this is
# "./data/cluster_7.Rdata".
load("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/cluster_7.Rdata")

This cluster_7 value possesses the preloaded output that would come from a K-means cluster analysis. We can view these using:

# Inspect the structure of the kmeans result object (assignments, centers,
# within/between sums of squares, cluster sizes).
str(cluster_7)
## List of 9
##  $ cluster     : Named int [1:1584] 7 5 7 5 5 7 5 1 1 4 ...
##   ..- attr(*, "names")= chr [1:1584] "E00032987" "E00032988" "E00032989" "E00032990" ...
##  $ centers     : num [1:7, 1:60] 0.553 0.584 0.677 0.666 0.391 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:7] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:60] "k001" "k002" "k003" "k004" ...
##  $ totss       : num 2827
##  $ withinss    : num [1:7] 286 308 250 255 159 ...
##  $ tot.withinss: num 1635
##  $ betweenss   : num 1192
##  $ size        : int [1:7] 259 340 279 334 109 73 190
##  $ iter        : int 6
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

From here, we can aggregate the results of this cluster analysis into a lookup table for easier mapping:

# Build a lookup table linking every OA code to its numeric cluster (K_7)
# and a letter recoding of that cluster (SUPER).
lookup <- data.frame(K_7 = cluster_7$cluster)
# The cluster assignments are named by OA code, so the codes arrive as the
# data frame's row names — copy them into a proper column.
lookup$OA <- rownames(lookup)
# Recode the numeric clusters 1-7 as letters A-G.
lookup$SUPER <- LETTERS[lookup$K_7]

We can look at the distribution of areas in each of our 7 clusters:

# Frequency of Output Areas assigned to each of the 7 clusters.
table(lookup$K_7)
## 
##   1   2   3   4   5   6   7 
## 259 340 279 334 109  73 190

These values also give us the ability to create a choropleth map of Liverpool census regions for our k-means clusters:

# Map the clusters with tmap.
# NOTE(review): rgdal was retired from CRAN in 2023 — for new work read the
# GeoJSON with sf::st_read() instead of readOGR().
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.5-8, (SVN revision 990)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.0.4, released 2020/01/28
## Path to GDAL shared files: C:/Users/Finnmcoolr/Documents/R/win-library/3.6/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: C:/Users/Finnmcoolr/Documents/R/win-library/3.6/rgdal/proj
## Linking to sp version:1.4-2
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(tmap)
#Import our OA boundary polygons (absolute local path; "./data/..." on the repo)
liverpool_SP <- readOGR("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/Liverpool_OA_2011.geojson")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded datum OSGB_1936 in CRS definition: +proj=tmerc +lat_0=49
## +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +units=m +no_defs
## OGR data source with driver: GeoJSON 
## Source: "C:\Users\Finnmcoolr\Documents\GIS 3\Class Assignments\urban_analytics-master\10_Data_Reduction_Geodemographics\data\Liverpool_OA_2011.geojson", layer: "Liverpool_OA_2011"
## with 1584 features
## It has 1 fields
# Merge the cluster lookup onto the polygons; the OA code column is named
# "oa_code" in the GeoJSON but "OA" in the lookup table.
liverpool_SP <- merge(liverpool_SP, lookup, by.x="oa_code",by.y="OA")

# Build the thematic map: cluster polygons (displayed in British National
# Grid, EPSG:27700) plus legend, scale bar and compass.
m <- tm_shape(liverpool_SP, projection=27700) +
    tm_polygons(col="SUPER", border.col = "black",   palette="Set1",border.alpha = .3, title="Cluster", showNA=FALSE) +
  tm_layout(legend.position = c("left", "bottom"), frame = FALSE, title = "OAS Clusters of Liverpool", title.position = c("right","top"))+
  tm_scale_bar()+ tm_compass(north = 0, type = "4star")

#Create leaflet plot (interactive view mode)
tmap_leaflet(m)
## Compass not supported in view mode.
## Warning in sp::proj4string(obj): CRS object has comment, which is lost in output
## Warning: The shape liverpool_SP is invalid. See sf::st_is_valid
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
## legend.postion is used for plot mode. Use view.legend.position in tm_view to set the legend position in view mode.

Next, we can create descriptions and scores for the above clusters. We can do this by comparing rates above to the average for the sample area (Liverpool). We do this first with a table of index scores.

# Merge the cluster lookup with our original counts (including denominators)
LiVOAC_Lookup_Input <- merge(lookup,OAC_Input,by="OA",all.x=TRUE)

# Remove the variables already in ratio form (population density k007 and
# the SIR k035) — ratios cannot meaningfully be summed across OAs.
LiVOAC_Lookup_Input$k007 <- NULL
LiVOAC_Lookup_Input$k035 <- NULL

# Aggregate the count columns by SuperGroup/cluster letter.
# NOTE(review): 4:78 is a positional range over the count columns — fragile
# if the column layout changes upstream.
SuperGroup <- aggregate(LiVOAC_Lookup_Input[,4:78], by=list(LiVOAC_Lookup_Input$SUPER), FUN=sum)

# Index score per variable: the cluster's rate divided by the overall rate,
# times 100 (100 = Liverpool average, >100 = over-represented). Columns are
# built with lapply() and bound once rather than grown with cbind() inside
# a loop; seq_len() replaces the unsafe 1:nrow() pattern.
index_cols <- lapply(seq_len(nrow(K_Var)), function(n) {
  num <- paste(K_Var[n, "VariableCode"])  # numerator name
  den <- paste(K_Var[n, "Denominator"])   # denominator name
  round((SuperGroup[, num] / SuperGroup[, den]) /
          (sum(SuperGroup[, num]) / sum(SuperGroup[, den])) * 100)
})
names(index_cols) <- paste(K_Var$VariableCode)
G_Index <- cbind(data.frame(SUPER = LETTERS[1:7]), do.call(cbind, index_cols))

# View our index scores
G_Index
##   SUPER k001 k002 k003 k004 k005 k006 k008 k009 k010 k011 k012 k013 k014 k015
## 1     A   83   91   91  114  147  237   90   92   84  144  106   81   38   40
## 2     B   90  109   84  129  124  121   23   65  164   71  106   60   97   92
## 3     C  125  104  115   98   87   66    8   98  105  102  106   78   71   56
## 4     D  121  129   92  104  108   75   31   95   98  117  107   63   59   23
## 5     E   45   21  184   59   33   41   98  152   42   80   89  171  210  197
## 6     F   35   31   62   32   30   37  933  171   29   33   84  137  280  348
## 7     G  129  113  120   83   73   50   20  112   76  125   77  238  139  195
##   k016 k017 k018 k019 k020 k021 k022 k023 k024 k025 k026 k027 k028 k029 k030
## 1   41   43   59   46  105   60   73   51   73   95    6   51   89   82  155
## 2   48   73   25   32  105   68   29   44  142  130   11  317  221   24   30
## 3   44   47   48   38  104   81   75   55  115  100   58   30   45  185   43
## 4   29   39   49   22  106   41   76   57   79  140    4   66  119  147   11
## 5   79  171  146  275   86  432  186  136  144   14  283   14    9    8  375
## 6  393  415  131  143   86  199  116  148   71   42 1040   31   26  101  199
## 7  305  186  408  408   84  134  291  357   68   70  128   57   59  109  143
##   k031 k032 k033 k034 k036 k037 k038 k039 k040 k041 k042 k043 k044 k045 k046
## 1   70  183   61   92  109  100   62   64   44   57   97   83   83  128   98
## 2  181   11   43   23  123  110   84  143   54  236   80  157   61   53   91
## 3  127   31  128   59   98  113   94  105   64   93  128  115   98   97   90
## 4   87  164   48   69  113  117   68   45   54   65  104   87   89  132  111
## 5   39   67  263  302   53   57  112  227  148   61  105   87  260   79   64
## 6   54   71  231  261   42   47  322   97  480   77   72   42  114   38  178
## 7   48  152  140  159   82   93   84   88  101   39  111   65  117  158  112
##   k047 k048 k049 k050 k051 k052 k053 k054 k055 k056 k057 k058 k059 k060
## 1  101   56  114  112  107  103  126   90   76   93  120   99   84  102
## 2  104   73  115  106  104   87   94   58  119  121   70  125  124   96
## 3  104   98  100  100  104   98  106   80   97  107   92  114  104  104
## 4   95  110  120  121  123  113  126   93   54   82  136   87   68  110
## 5  117   98   48   67   61   78   58  132  209  117   78   82  121   90
## 6   64  295   37   43   25  143   44  258  102   60   76   54  105   75
## 7   95  108   81   90  102  105   89  165   82   80  137   71   86  103

To format this, we can use the reshape package to have a nice, color coded table.

library(reshape2)

We first need to reshape our data from wide to long format.

# Reshape from wide (one column per variable) to long format: one row per
# SUPER x variable combination, with the index score in `value`.
G_Index_Melt <- melt(G_Index, id.vars="SUPER")
# View the top of our new long-format data
head(G_Index_Melt)
##   SUPER variable value
## 1     A     k001    83
## 2     B     k001    90
## 3     C     k001   125
## 4     D     k001   121
## 5     E     k001    45
## 6     F     k001    35

Finally, we should add more specific variable names to each variable on our final chart. Additionally, we must set the colour bands that each cell on the chart will correspond to.

# Band the index scores into three groups around the 100 average:
# under-represented (<= 80), typical (80-120), over-represented (> 120).
# (In the else branch the value is already known to exceed 80, so a single
# nested condition suffices.)
G_Index_Melt$band <- ifelse(G_Index_Melt$value <= 80, "< 80",
                     ifelse(G_Index_Melt$value <= 120, "80-120", ">120"))

# Attach brief variable descriptions from an external lookup CSV.
short <- read.csv("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/OAC_Input_Lookup_short_labels.csv")
G_Index_Melt <- merge(G_Index_Melt,short,by.x="variable",by.y="VariableCode",all.x=TRUE)

# Fix factor level order so the bands and variables plot in a sensible order.
G_Index_Melt$band <- factor(G_Index_Melt$band, levels = c("< 80","80-120",">120"))
G_Index_Melt$VariableDescription <- factor(G_Index_Melt$VariableDescription, levels = short$VariableDescription)

Finally, plot our ending table in ggplot.

library(ggplot2)

# Colour-coded table of index scores: one tile per cluster x variable,
# shaded by band, with the score printed in each cell.
p <- ggplot(G_Index_Melt, aes(x = SUPER, y = VariableDescription, label = value, fill = band)) +
  geom_tile(alpha = 0.8) +
  geom_text(colour = "black") +
  scale_fill_manual(name = "Band", values = c("#EB753B", "#F7D865", "#B3D09F")) +
  scale_x_discrete(position = "top")
p

The above map consists of k-means-derived clusters dividing Liverpool into 7 groups. These clusters stem from the normalised values of the above variables: areas with similar high or low values are bundled together by the k-means algorithm, and appear on our map as clusters A–G. This informs us both about the spatial makeup of Liverpool and about the connections between high and low values of each of these variables.